Since I want to develop my skills in text mining and do not yet know much about it, I wanted to select interesting data that has already been studied on Kaggle.
I want to classify data from a former featured Kaggle prediction competition, the “Toxic Comment Classification Challenge”. This competition’s aim is to find negative online behaviors, like toxic comments (i.e. comments that are rude, disrespectful or otherwise likely to make someone leave a discussion). In this competition, I am going to challenge myself to build a multi-headed classification model that’s capable of detecting different types of toxicity like threats, obscenity, insults, and identity-based hate. I combine them into one column and try to find comments which contain any of them.
You can reach competition page at the link below.
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
I will be using a dataset of comments from Wikipedia’s talk page edits which is provided by Kaggle for competition. You can reach data at the link below.
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
import time
import warnings
import pandas as pd, numpy as np
%matplotlib inline
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
#color = sns.color_palette()
#from wordcloud import WordCloud ,STOPWORDS
#from PIL import Image
import re
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
color = sns.color_palette()
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
#import locale;
#print(locale.getdefaultlocale());
from IPython.display import Image
from IPython.core.display import HTML
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
# Download NLTK resources needed for lemmatization and stopword removal.
nltk.download('wordnet')
nltk.download('stopwords')
eng_stopwords = set(stopwords.words("english"))
warnings.filterwarnings("ignore")
# TweetTokenizer copes well with informal online text; the WordNet
# lemmatizer reduces inflected verb forms to their base form.
tokenizer=TweetTokenizer()
lem = WordNetLemmatizer()
# Load the Kaggle training data, shuffle deterministically, and keep the
# first 20k rows for the experiments; the remainder (df_others) is held
# out and reused later for data augmentation.
df = pd.read_csv('/Users/yetkineser/Desktop/BDA 502/project/data/train.csv')
df = shuffle(df,random_state=7)
df_others = df.iloc[20000:,]
df = df.iloc[:20000,]
df = df.reset_index(drop=True)
df_others = df_others.reset_index(drop=True)
# Quick look at the data and a couple of sample comments.
df.head(15)
df['comment_text'][3]
df['comment_text'][7]
# Comment-length statistics and distribution.
lengths = df.comment_text.str.len()
lengths.mean(), lengths.std(), lengths.max()
lengths.hist();
# Binary label columns provided by the competition.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# 'none' marks a clean comment; 'any' marks a comment with at least one
# toxic tag ('any' is the single combined target used for modelling).
df['none'] = 1-df[label_cols].max(axis=1)
df['any'] = df[label_cols].max(axis=1)
df.describe()
label_cols = ['any','toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Per-label frequency tables (0 vs 1 counts for each tag).
for col in label_cols:
count = df.groupby(col)['any'].count()
new_df = pd.concat([count], axis=1)
new_df.columns = ['count']
display(new_df.sort_values(by=['count'],ascending=False))
print("Total comments = ",len(df))
print("Total clean comments = ",len(df)-df['any'].sum())
# Bar chart: number of comments per class (includes none/any columns).
x=df.iloc[:,2:10].sum()
#plot
plt.figure(figsize=(8,4))
ax= sns.barplot(x.index, x.values, alpha=0.8)
plt.title("# per class")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('Type ', fontsize=12)
#adding the text labels
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show()
# Bar chart: how many comments carry multiple tags at once.
rowsums=df.iloc[:,2:].sum(axis=1)
x=rowsums.value_counts()
#plot
plt.figure(figsize=(8,4))
ax = sns.barplot(x.index, x.values, alpha=0.8,color=color[2])
plt.title("Multiple tags per comment")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('# of tags ', fontsize=12)
#adding the text labels
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show()
# Sanity check: missing values per column.
print("Check for missing values in Train dataset")
null_check=df.isnull().sum()
print(null_check)
There are no null values in our dataset.
Looking at the first five rows of our dataset:
df.head()
# Correlation between the toxicity labels (id/text and derived columns
# sliced off).
# NOTE(review): iloc[:,2:-3] keeps toxic..insult only -- it also drops
# identity_hate, not just none/any; confirm whether 2:-2 was intended.
temp_df=df.iloc[:,2:-3]
# filter temp by removing clean comments
# temp_df=temp_df[~train.clean]
corr=temp_df.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values, annot=True)
# Illustration image for cross-validation (local path).
PATH = "/Users/yetkineser/Desktop/BDA 502/project/photos/"
Image(filename = PATH + "crossvalidation.png", width=800, height=600)
# 80/20 train/test split on the 20k-row sample.
total_rows = (len(df))
train_rows = round(0.8*total_rows)
train = df.iloc[:train_rows,]
train_others = train # i added this later
# Negative start index: equivalent to df.iloc[train_rows:,] (last 20%).
test = df.iloc[train_rows-total_rows:,]
# *_2 frames keep only the id and comment_text columns.
train_2=train.iloc[:,0:2]
test_2=test.iloc[:,0:2]
df_2=df.iloc[:,0:2]
df_2=df_2.reset_index(drop=True)
print("- I have ",total_rows, " rows on my data set ")
print("- I have ",train_rows, " rows on my first train set ")
print("- I have ",total_rows-train_rows, " rows on my test set ")
#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Apostrophe-contraction lookup: maps common English contractions to their
# expanded forms during token cleaning.
# Fixes vs. the original: removed duplicate keys ("i'd" and "didn't" were
# listed twice, silently overwriting earlier values) and corrected
# "we'll", whose value had lost the word "we".
APPO = {
"aren't" : "are not",
"can't" : "can not",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll": "we will",
"tryin'":"trying"
}
# Working text series: full corpus plus the train/test comment columns.
corpus=df_2.comment_text
train_text = train_2.comment_text
test_text = test_2.comment_text
def clean(comment):
    """
    Normalize one raw comment into a cleaned, lemmatized word string.

    Steps: lowercase, strip newlines/slashes/bullets, drop leaky
    IP addresses and wiki [[user]] markup, tokenize, expand
    apostrophe contractions via APPO, lemmatize verbs, remove
    English stopwords, then strip non-word characters and digits.

    Fixes vs. the original: regex patterns are raw strings (the old
    plain strings like "\\d" relied on invalid escape sequences, which
    raise DeprecationWarning on modern Python), and a no-op
    re.sub(" ", " ", ...) was removed.
    """
    # Convert to lower case, so that "Hi" and "hi" are the same token.
    comment = comment.lower()
    # Replace newlines and separators that would glue words together.
    comment = re.sub(r"\n", " ", comment)
    comment = re.sub(r"/", " ", comment)
    comment = re.sub(r"•", " ", comment)
    # Remove leaky elements such as IP addresses...
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # ...and wiki-style [[username]] markup.
    comment = re.sub(r"\[\[.*\]", "", comment)
    # Split the sentence into word tokens.
    words = tokenizer.tokenize(comment)
    # Apostrophe expansion (e.g. you're -> you are) via the APPO lookup.
    words = [APPO[word] if word in APPO else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in eng_stopwords]
    clean_sent = " ".join(words)
    # Remove any remaining non-alphanumeric characters and digits.
    clean_sent = re.sub(r"\W+", " ", clean_sent)
    clean_sent = re.sub(r"[0-9]+", "", clean_sent)
    return clean_sent
# Example: raw comment vs. its cleaned version, then clean the whole corpus.
corpus.iloc[5]
clean(corpus.iloc[5])
clean_corpus=corpus.apply(lambda x :clean(x))
# NOTE(review): this version is shadowed by the multilabel redefinition of
# top_feats_by_class further below and is never called -- kept for reference.
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
calculated across documents with the same class label. '''
dfs = []
labels = np.unique(y)
for label in labels:
ids = np.where(y==label)
feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
feats_df.label = label
dfs.append(feats_df)
return dfs
# Unigram tf-idf on the cleaned corpus (drop terms seen in <200 documents).
tfv = TfidfVectorizer(min_df=200, max_features=10000,
strip_accents='unicode', analyzer='word',ngram_range=(1,1),
use_idf=1,smooth_idf=1,sublinear_tf=1,
stop_words = 'english')
tfv.fit(clean_corpus)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() -- confirm the pinned sklearn version.
features = np.array(tfv.get_feature_names())
df_unigrams = tfv.transform(clean_corpus.iloc[:df.shape[0]])
#serperate train and test features
df_feats=df.iloc[0:len(df),]
#join the tags
df_tags=df.iloc[:,2:]
df_feats=pd.concat([df_feats,df_tags],axis=1)
def top_tfidf_feats(row, features, top_n=50):
    '''Return the top_n tf-idf scores in `row` paired with their feature names,
    ordered from highest to lowest, as a DataFrame with columns feature/tfidf.'''
    # Indices of the largest tf-idf values, descending.
    order = np.argsort(row)[::-1][:top_n]
    pairs = {'feature': [features[i] for i in order],
             'tfidf': [row[i] for i in order]}
    return pd.DataFrame(pairs, columns=['feature', 'tfidf'])
def top_feats_in_doc(Xtr, features, row_id, top_n=50):
    '''Top tf-idf features for one document (a single sparse matrix row).'''
    # Densify the single row, then rank it.
    dense_row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(dense_row, features, top_n)
def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=50):
    '''Top_n features by mean tf-idf across the documents whose row indices
    are given in grp_ids; entries below min_tfidf are zeroed before averaging.'''
    dense = Xtr[grp_ids].toarray()
    # Suppress weak signals so the mean reflects genuinely present terms.
    dense[dense < min_tfidf] = 0
    return top_tfidf_feats(np.mean(dense, axis=0), features, top_n)
# modified for multilabel milticlass
def top_feats_by_class(Xtr, features, min_tfidf=0.005, top_n=50):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
    calculated across documents with the same class label (one df per tag
    column of the module-level df_tags). '''
    dfs = []
    cols = df_tags.columns
    for col in cols:
        # Row indices where this tag is set.
        ids = df_tags.index[df_tags[col] == 1]
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # BUG FIX: the original assigned the stale module-level `label`
        # (left over from the earlier plotting loops) to every frame;
        # tag each frame with its own column name instead.
        feats_df.label = col
        dfs.append(feats_df)
    return dfs
tfidf_top_n_per_lass=top_feats_by_class(df_unigrams,features)
from sklearn.feature_extraction.text import TfidfVectorizer
# Word-level (1-gram) and char-level (2-6 gram) tf-idf features fitted on
# the RAW corpus; train/test matrices are transformed from the same fit.
word_vectorizer = TfidfVectorizer(
sublinear_tf=True,
strip_accents='unicode',
analyzer='word',
token_pattern=r'\w{1,}',
stop_words='english',
ngram_range=(1, 1),
max_features=10000)
word_vectorizer.fit(corpus)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
char_vectorizer = TfidfVectorizer(
sublinear_tf=True,
strip_accents='unicode',
analyzer='char',
stop_words='english',
ngram_range=(2, 6),
max_features=50000)
char_vectorizer.fit(corpus)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
# Reference images: evaluation metrics and logistic regression explanation.
Image(filename = PATH + "metrics.png", width=700, height=700)
Image(filename = PATH + "logistic regression.png", width=700, height=600)
from sklearn.linear_model import LogisticRegression
# Single combined target: 'any' (comment has at least one toxic tag).
class_names = ['any']
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
# Stack char + word tf-idf blocks into one sparse feature matrix.
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
scores = []
scores2 = []
scores3 = []
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': train['id']})
# --- Logistic Regression, round 1: 5-fold CV for accuracy/precision/recall,
# --- then a single fit, probability predictions and train/test evaluation.
time1=time.time()
for class_name in class_names:
train_target = train[class_name]
classifier = LogisticRegression(C=5, solver='sag')
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='accuracy')
scores.append(cv_score)
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='precision')
scores2.append(cv_score)
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='recall')
scores3.append(cv_score)
time2=time.time()
print('Average CV accuracy is {}'.format(round(np.mean(scores),5)))
print('Standard Deviation of CV accuracy is {}'.format(round(np.std(scores),5)))
print('Average CV precion is {}'.format(round(np.mean(scores2),5)))
print('Standard Deviation of CV precision is {}'.format(round(np.std(scores2),5)))
print('Average CV recall is {}'.format(round(np.mean(scores3),5)))
print('Standard Deviation of CV recall is {}'.format(round(np.std(scores3),5)))
print("Time of cross validation",round(time2-time1,5))
# Refit on the full training set and predict class-1 probabilities.
classifier = LogisticRegression(C=5, solver='sag')
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
# Hard labels at the 0.5 probability threshold.
test["pred"]=test_pred["any"]>0.5
train["pred"]=train_pred["any"]>0.5
print("Accuracy score of train set : ",round(accuracy_score(train["any"], train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(train["any"], train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(train["any"], train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Confusion matrices for train and test predictions.
confusion_matrix_train = confusion_matrix(train["any"], train["pred"])
print(confusion_matrix_train)
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
# --- Multinomial Naive Bayes, round 1: same CV / fit / evaluation
# --- pipeline as the logistic regression block above.
class_names = ['any']
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
scores = []
scores2 = []
scores3 = []
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': train['id']})
time1=time.time()
for class_name in class_names:
train_target = train[class_name]
classifier = MultinomialNB()
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='accuracy')
scores.append(cv_score)
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='precision')
scores2.append(cv_score)
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='recall')
scores3.append(cv_score)
time2=time.time()
print('Average CV accuracy is {}'.format(round(np.mean(scores),5)))
print('Standard Deviation of CV accuracy is {}'.format(round(np.std(scores),5)))
print('Average CV precion is {}'.format(round(np.mean(scores2),5)))
print('Standard Deviation of CV precision is {}'.format(round(np.std(scores2),5)))
print('Average CV recall is {}'.format(round(np.mean(scores3),5)))
print('Standard Deviation of CV recall is {}'.format(round(np.std(scores3),5)))
print("Time of cross validation",round(time2-time1,5))
# Refit on the full training set and predict class-1 probabilities.
classifier = MultinomialNB()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
# Hard labels at the 0.5 probability threshold.
test["pred"]=test_pred["any"]>0.5
train["pred"]=train_pred["any"]>0.5
print("Accuracy score of train set : ",round(accuracy_score(train["any"], train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(train["any"], train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(train["any"], train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Confusion matrices for train and test predictions.
confusion_matrix_train = confusion_matrix(train["any"], train["pred"])
print(confusion_matrix_train)
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
# --- AdaBoost, round 1: same CV / fit / evaluation pipeline as above.
class_names = ['any']
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
scores = []
scores2 = []
scores3 = []
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': train['id']})
time1=time.time()
for class_name in class_names:
train_target = train[class_name]
classifier = AdaBoostClassifier()
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='accuracy')
scores.append(cv_score)
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='precision')
scores2.append(cv_score)
cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='recall')
scores3.append(cv_score)
time2=time.time()
print('Average CV accuracy is {}'.format(round(np.mean(scores),5)))
print('Standard Deviation of CV accuracy is {}'.format(round(np.std(scores),5)))
print('Average CV precion is {}'.format(round(np.mean(scores2),5)))
print('Standard Deviation of CV precision is {}'.format(round(np.std(scores2),5)))
print('Average CV recall is {}'.format(round(np.mean(scores3),5)))
print('Standard Deviation of CV recall is {}'.format(round(np.std(scores3),5)))
print("Time of cross validation",round(time2-time1,5))
# Refit on the full training set and predict class-1 probabilities.
classifier = AdaBoostClassifier()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
# Hard labels at the 0.5 probability threshold.
test["pred"]=test_pred["any"]>0.5
train["pred"]=train_pred["any"]>0.5
print("Accuracy score of train set : ",round(accuracy_score(train["any"], train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(train["any"], train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(train["any"], train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Confusion matrices for train and test predictions.
confusion_matrix_train = confusion_matrix(train["any"], train["pred"])
print(confusion_matrix_train)
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
Image(filename = PATH + "metricsresults1.png", width=800, height=600)
# Inspect test-set comments the model failed to flag (false negatives:
# truly toxic but predicted clean).
test_recall = test.loc[(test['any'] == 1) & (test['pred']==False)]
# FIX: .ix was removed in pandas 1.0 -- use .loc for label-based selection.
test_recall.loc[:, ["comment_text","any","pred"]]
test_recall = test_recall.reset_index(drop=True)
train_recall = train.loc[(train['any'] == 1) & (train['pred']==False)]
# Sample missed comments for qualitative error analysis.
test_recall["comment_text"][7]
test_recall["comment_text"][3]
test_recall["comment_text"][12]
Image(filename = PATH + "steps2.png", width=800, height=600)
# Oversample the positive ("any" toxic) class: replicate every toxic
# training row 10x and append to the train set to fight class imbalance.
train_any_1 = train.loc[(train['any'] == 1)]
# FIX: .ix was removed in pandas 1.0 -- use .loc for label-based selection.
train_any_1.loc[:, ["comment_text","any","pred"]]
train_any_1 = train_any_1.reset_index(drop=True)
new_train = pd.concat([train, train_any_1, train_any_1, train_any_1,
train_any_1, train_any_1, train_any_1,
train_any_1,train_any_1, train_any_1, train_any_1])
train_rows = (len(new_train))
test_rows = (len(test))
train_rows = round(train_rows)
# NOTE(review): this slices the ORIGINAL df by the oversampled row count
# (clamped by iloc); it is superseded by new_train below -- confirm unused.
train = df.iloc[:train_rows,]
# Rebuild the id/text frames from the oversampled train set.
train_2=new_train.iloc[:,0:2]
test_2=test.iloc[:,0:2]
df = pd.concat([train_2,test_2])
df_2=df.iloc[:,0:2]
df_2=df_2.reset_index(drop=True)
print("- I have ",train_rows, " rows on my new regenerated train set ")
print("- I have ",test_rows, " rows on my test set ")
corpus=df_2.comment_text
train_text = train_2.comment_text
test_text = test_2.comment_text
clean_corpus=corpus.apply(lambda x :clean(x))
tfv = TfidfVectorizer(min_df=200, max_features=10000,
strip_accents='unicode', analyzer='word',ngram_range=(1,1),
use_idf=1,smooth_idf=1,sublinear_tf=1,
stop_words = 'english')
tfv.fit(clean_corpus)
features = np.array(tfv.get_feature_names())
df_unigrams = tfv.transform(clean_corpus.iloc[:df.shape[0]])
#serperate train and test features
df_feats=df.iloc[0:len(df),]
#join the tags
df_tags=df.iloc[:,2:]
df_feats=pd.concat([df_feats,df_tags],axis=1)
tfidf_top_n_per_lass=top_feats_by_class(df_unigrams,features)
from sklearn.feature_extraction.text import TfidfVectorizer
word_vectorizer = TfidfVectorizer(
sublinear_tf=True,
strip_accents='unicode',
analyzer='word',
token_pattern=r'\w{1,}',
stop_words='english',
ngram_range=(1, 1),
max_features=10000)
word_vectorizer.fit(corpus)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
char_vectorizer = TfidfVectorizer(
sublinear_tf=True,
strip_accents='unicode',
analyzer='char',
stop_words='english',
ngram_range=(2, 6),
max_features=50000)
char_vectorizer.fit(corpus)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
train_target = new_train[class_name]
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': new_train['id']})
classifier = LogisticRegression(C=5, solver='sag')
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
test["pred"]=test_pred["any"]>0.5
new_train["pred"]=train_pred["any"]>0.5
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
print("Accuracy score of train set : ",round(accuracy_score(new_train["any"], new_train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(new_train["any"], new_train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(new_train["any"], new_train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
classifier = MultinomialNB()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
test["pred"]=test_pred["any"]>0.5
new_train["pred"]=train_pred["any"]>0.5
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
print("Accuracy score of train set : ",round(accuracy_score(new_train["any"], new_train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(new_train["any"], new_train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(new_train["any"], new_train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
classifier = AdaBoostClassifier()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
test["pred"]=test_pred["any"]>0.5
new_train["pred"]=train_pred["any"]>0.5
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
print("Accuracy score of train set : ",round(accuracy_score(new_train["any"], new_train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(new_train["any"], new_train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(new_train["any"], new_train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
Image(filename = PATH + "results2.png", width=800, height=600)
len(df_others)
# Build a second augmented train set: add every toxic row from the held-out
# pool (df_others) to the original training rows (train_others).
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df_others['none'] = 1-df_others[label_cols].max(axis=1)
df_others['any'] = df_others[label_cols].max(axis=1)
df_others_any_1 = df_others.loc[(df_others['any'] == 1)]
# FIX: .ix was removed in pandas 1.0 -- use .loc for label-based selection.
df_others_any_1.loc[:, ["comment_text","any","pred"]]
df_others_any_1 = df_others_any_1.reset_index(drop=True)
len(df_others_any_1)
new_train_2 = pd.concat([train_others, df_others_any_1])
len(new_train_2)
train_rows = (len(new_train_2))
test_rows = (len(test))
train_rows = round(train_rows)
train = df.iloc[:train_rows,]
# Rebuild the id/text frames from the augmented train set.
train_2=new_train_2.iloc[:,0:2]
test_2=test.iloc[:,0:2]
df = pd.concat([train_2,test_2])
df_2=df.iloc[:,0:2]
df_2=df_2.reset_index(drop=True)
print("- I have ",train_rows, " rows on my new regenerated train set ")
print("- I have ",test_rows, " rows on my test set ")
corpus=df_2.comment_text
train_text = train_2.comment_text
test_text = test_2.comment_text
# Re-run the cleaning and tf-idf pipeline on the augmented corpus
# (same steps as rounds 1 and 2).
clean_corpus=corpus.apply(lambda x :clean(x))
tfv = TfidfVectorizer(min_df=200, max_features=10000,
strip_accents='unicode', analyzer='word',ngram_range=(1,1),
use_idf=1,smooth_idf=1,sublinear_tf=1,
stop_words = 'english')
tfv.fit(clean_corpus)
features = np.array(tfv.get_feature_names())
df_unigrams = tfv.transform(clean_corpus.iloc[:df.shape[0]])
#serperate train and test features
df_feats=df.iloc[0:len(df),]
#join the tags
df_tags=df.iloc[:,2:]
df_feats=pd.concat([df_feats,df_tags],axis=1)
tfidf_top_n_per_lass=top_feats_by_class(df_unigrams,features)
word_vectorizer = TfidfVectorizer(
sublinear_tf=True,
strip_accents='unicode',
analyzer='word',
token_pattern=r'\w{1,}',
stop_words='english',
ngram_range=(1, 1),
max_features=10000)
word_vectorizer.fit(corpus)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
char_vectorizer = TfidfVectorizer(
sublinear_tf=True,
strip_accents='unicode',
analyzer='char',
stop_words='english',
ngram_range=(2, 6),
max_features=50000)
char_vectorizer.fit(corpus)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
# Combined sparse features and targets for the augmented train set.
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
train_target = new_train_2[class_name]
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': new_train_2['id']})
# ---- Logistic Regression baseline -----------------------------------------
# Fit on the stacked char+word TF-IDF features, report timing and
# accuracy/precision/recall on both splits, then plot train/test ROC curves.
# FIX vs original: removed the unused `preds = test["pred"]` assignment and
# the redundant re-import of matplotlib.pyplot (already imported at the top
# of the notebook). Printed output is unchanged.
classifier = LogisticRegression(C=5, solver='sag')

time1 = time.time()
classifier.fit(train_features, train_target)
time2 = time.time()

# Column 1 of predict_proba is P(class == 1), i.e. the toxicity probability.
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3 = time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4 = time.time()

# Hard labels at the 0.5 probability threshold (assumes class_name == "any").
test["pred"] = test_pred["any"] > 0.5
new_train_2["pred"] = train_pred["any"] > 0.5

print("Training time = ", round(time2 - time1, 5))
print("Test time = ", round(time4 - time3, 5))
print("Accuracy score of train set : ", round(accuracy_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Accuracy score of test set : ", round(accuracy_score(test["any"], test["pred"]), 5))
print("Precision score of train set : ", round(precision_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Precision score of test set : ", round(precision_score(test["any"], test["pred"]), 5))
print("Recall score of train set : ", round(recall_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Recall score of test set : ", round(recall_score(test["any"], test["pred"]), 5))

# ROC/AUC are computed from the *probabilities* (not the thresholded labels),
# so they measure ranking quality across all possible thresholds.
from sklearn import metrics
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train_2["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# ---- Multinomial Naive Bayes ----------------------------------------------
# Same evaluation protocol as the logistic-regression cell.
# FIX vs original: MultinomialNB is not imported in the notebook header
# (only LogisticRegression and AdaBoostClassifier are), which raises a
# NameError if this cell runs first — import it here; a repeat import is a
# no-op if an earlier cell already did it. Also removed the unused
# `preds = test["pred"]` assignment and the redundant matplotlib re-import.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()

time1 = time.time()
classifier.fit(train_features, train_target)
time2 = time.time()

# Column 1 of predict_proba is P(class == 1), i.e. the toxicity probability.
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3 = time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4 = time.time()

# Hard labels at the 0.5 probability threshold (assumes class_name == "any").
test["pred"] = test_pred["any"] > 0.5
new_train_2["pred"] = train_pred["any"] > 0.5

print("Training time = ", round(time2 - time1, 5))
print("Test time = ", round(time4 - time3, 5))
print("Accuracy score of train set : ", round(accuracy_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Accuracy score of test set : ", round(accuracy_score(test["any"], test["pred"]), 5))
print("Precision score of train set : ", round(precision_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Precision score of test set : ", round(precision_score(test["any"], test["pred"]), 5))
print("Recall score of train set : ", round(recall_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Recall score of test set : ", round(recall_score(test["any"], test["pred"]), 5))

# ROC/AUC from the probabilities (not the thresholded labels).
from sklearn import metrics
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train_2["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# ---- AdaBoost -------------------------------------------------------------
# Same evaluation protocol as the previous classifier cells, with default
# AdaBoostClassifier hyperparameters.
# FIX vs original: removed the unused `preds = test["pred"]` assignment and
# the redundant re-imports of sklearn.metrics (first imported in the
# logistic-regression cell) and matplotlib.pyplot (imported in the header).
# Printed output is unchanged.
classifier = AdaBoostClassifier()

time1 = time.time()
classifier.fit(train_features, train_target)
time2 = time.time()

# Column 1 of predict_proba is P(class == 1), i.e. the toxicity probability.
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3 = time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4 = time.time()

# Hard labels at the 0.5 probability threshold (assumes class_name == "any").
test["pred"] = test_pred["any"] > 0.5
new_train_2["pred"] = train_pred["any"] > 0.5

print("Training time = ", round(time2 - time1, 5))
print("Test time = ", round(time4 - time3, 5))
print("Accuracy score of train set : ", round(accuracy_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Accuracy score of test set : ", round(accuracy_score(test["any"], test["pred"]), 5))
print("Precision score of train set : ", round(precision_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Precision score of test set : ", round(precision_score(test["any"], test["pred"]), 5))
print("Recall score of train set : ", round(recall_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Recall score of test set : ", round(recall_score(test["any"], test["pred"]), 5))

# ROC/AUC from the probabilities (not the thresholded labels).
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train_2["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Display a pre-rendered results-comparison figure in the notebook.
# NOTE(review): `PATH` is defined earlier in the notebook — presumably a
# string ending in a path separator, since it is joined by plain `+`; confirm.
Image(filename = PATH + "results3.png", width=800, height=600)